【Agenda】

/ Part I - 前置準備
/ Part II - 分析產品在特定月份佔全年度的銷售比例
/ Part III - 巴西物流分析
/ Part IV - 購買分析
/ Part V - 下訂時間與品項分析
/ 附錄 - 下單至購買被確認的等待時間對上訂單金額的關係

【Part I】前置準備


1. 載入本次會用到的 packages

library(dplyr)
library(ggplot2)
library(d3heatmap)
library(ROCR)
library(googleVis)
library(chorddiag)
library(plotly)
library(maps)
library(brazilmaps)
library(ggmap)
library(mapproj)

2. 讀取檔案

- 由於是合併多人的筆記檔案,因此額外備份了讀進來的 data.frame,以便後續程式碼連接。


setwd("/Users/allenlin/Documents/學校資料/2B_統計、機率與商業數據分析/Group5/Brazil")
Customers <- read.csv("olist_customers_dataset.csv", stringsAsFactors = F)
Geolocation <- read.csv("olist_geolocation_dataset.csv", stringsAsFactors = F)
Items <- read.csv("olist_order_items_dataset.csv", stringsAsFactors = F)
Payments <- read.csv("olist_order_payments_dataset.csv", stringsAsFactors = F)
Reviews <- read.csv("olist_order_reviews_dataset.csv", stringsAsFactors = F)
Orders <- read.csv("olist_orders_dataset.csv", stringsAsFactors = F)
Products <- read.csv("olist_products_dataset.csv", stringsAsFactors = F)
Sellers <- read.csv("olist_sellers_dataset.csv", stringsAsFactors = F)
Trans <- read.csv("product_category_name_translation.csv", stringsAsFactors = F)

C2 <- Customers
G2 <- Geolocation
I2 <- Items
PA2 <- Payments
R2 <- Reviews
O2 <- Orders
PR2 <- Products
S2 <- Sellers
T2 <- Trans

【Part II】分析產品在特定月份佔全年度的銷售比例


1. 將資料表中產品種類的葡萄牙文換成英文

temp <- full_join(Products, Trans, by = "product_category_name" ) %>% .[,-2] %>% .[,c(1,9,2:8)]

2. 合併表格

aa = select(Orders,order_id,order_purchase_timestamp)
bb = select(Items,order_id,price)
cc <- full_join(aa, bb, by = "order_id" )
dd = full_join(Items, cc , by = "order_id" )
ee <- full_join(dd,temp,by = "product_id" )
all_data = mutate(ee,when = format(as.POSIXct(dd$order_purchase_timestamp),"%H"))
all_data_2017 <- all_data[format(as.POSIXct(all_data$order_purchase_timestamp), "%Y") == "2017", ]

3. 將 all_data 的資料分別取出一至十二月份資料

Jan=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="01"
Feb=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="02"
Mar=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="03"
Apr=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="04"
May=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="05"
Jun=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="06"
Jul=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="07"
Aug=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="08"
Sep=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="09"
Oct=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="10"
Nov=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="11"
Dec=format(as.POSIXct(all_data_2017$order_purchase_timestamp),"%m")=="12"

my_choice = table(all_data_2017$product_category_name_english) %>% sort() %>% .[. > 100] %>% names()
all_data_by_choice = all_data_2017[all_data_2017$product_category_name_english %in% my_choice, ]
all_data_by_choice$product_category_name_english = all_data_by_choice$product_category_name_english %>% as.character()

Jan_data = all_data_by_choice[Jan,]
Feb_data = all_data_by_choice[Feb,]
Mar_data = all_data_by_choice[Mar,]
Apr_data = all_data_by_choice[Apr,]
May_data = all_data_by_choice[May,]
Jun_data = all_data_by_choice[Jun,]
Jul_data = all_data_by_choice[Jul,]
Aug_data = all_data_by_choice[Aug,]
Sep_data = all_data_by_choice[Sep,]
Oct_data = all_data_by_choice[Oct,]
Nov_data = all_data_by_choice[Nov,]
Dec_data = all_data_by_choice[Dec,]

4. 把一月和四月的缺漏項補起來

temp <- 0
names(temp) <- "kitchen_dining_laundry_garden_furniture"
Jan_data_ = table(Jan_data$product_category_name_english) %>% c(., temp) %>% .[sort(names(.))]

temp <- 0
names(temp) <- "agro_industry_and_commerce"
Apr_data_ = table(Apr_data$product_category_name_english) %>% c(., temp) %>% .[sort(names(.))]

5. 建立特定月份 / 整年度的各產品品項表格

all_data_ = 
cbind(
(Jan_data_)/table(all_data_by_choice$product_category_name_english),
(table(Feb_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(table(Mar_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(Apr_data_)/table(all_data_by_choice$product_category_name_english),
(table(May_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(table(Jun_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(table(Jul_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(table(Aug_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(table(Sep_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(table(Oct_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(table(Nov_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)),
(table(Dec_data$product_category_name_english) / table(all_data_by_choice$product_category_name_english)))
gplot = ggplot(all_data_2017,aes(x=when,fill=product_category_name_english))+
  geom_bar(position="fill")
ggplotly(gplot)

6.1. 特定月份 / 整年度的各產品品項熱圖 - 藍色版本

colnames(all_data_) =c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
as.table(all_data_) %>% 
  as.data.frame.matrix %>% 
  d3heatmap(F,F,col=colorRamp(c('lightyellow','skyblue','darkblue')))
all_data_2017$order_purchase_timestamp = as.POSIXct(all_data_2017$order_purchase_timestamp, format = "%Y-%m-%d %T")
all_data_2017 = mutate(all_data_2017, bym = format(all_data_2017$order_purchase_timestamp, "%m"))
all_data_2017 = mutate(all_data_2017, byday = format(all_data_2017$order_purchase_timestamp, "%m-%d"))

6.2. 特定月份 / 整年度的各產品品項熱圖 - 橘色版本

colnames(all_data_) =c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec")
as.table(all_data_) %>% 
  as.data.frame.matrix %>% 
  d3heatmap(F,F,col=colorRamp(c('lightyellow','wheat','orangered')))

7. 畫出月份趨勢圖

ggplot(all_data_2017,aes(x=bym, fill = bym))+geom_bar()+xlab("各個月份")+ylab("產品訂單數量")+ theme(text=element_text(family="黑體-繁 中黑", size=14))+
theme_light()

8. 畫出十一月每一天的趨勢圖

ggplot(all_data_2017[all_data_2017$bym=="11",],aes(x=byday,fill=byday))+geom_bar()+xlab("十一月份的每一天")+ylab("產品訂單數量")+ theme(text=element_text(family="黑體-繁 中黑", size=14))+theme_light()


【Part III】巴西物流分析


0. 將備用資料取出來,初始化 data.frame

- 同時將訂單資料中的時間資料格式轉換為 POSIXct 格式。


Customers <- C2
Geolocation <- G2
Items <- I2
Payments <- PA2
Reviews <- R2
Orders <- O2
Products <- PR2
Sellers <- S2
Trans <- T2

for ( i in 4:8 ) {
  Orders[,i] <- as.POSIXct(Orders[,i], format = "%Y-%m-%d %T")
}

# %T = %H:%M:%S

1. 整理 data.frame

- 補上單筆訂單的總價

- 將訂單金額、支付方式的表格 Payments併進訂單表 Orders

- 補上 waiting_time : 等待訂單被確認所花費的時間

- 補上 logis_time : 顧客下單後至收到商品所花費的物流時間

- 將產品類別翻譯為英文

- 將 Orders 併進 Items 表格之中

- 補上 When : 是在一天內的哪一個小時做交易

- 補上 Weekday : 是在一週間的星期幾做交易

- 將 Products 併進 Items 表格之中


Payments <- group_by(Payments, order_id) %>% summarise(., Total_payments = sum(payment_value) ) %>%
  full_join(Payments, ., by = "order_id")

Orders <- left_join(Orders, Payments, by = "order_id")

Orders <- difftime(Orders[,5],Orders[,4], units = "days") %>% mutate(Orders, waiting_time = .)
Orders <- difftime(Orders[,7],Orders[,4], units = "days") %>% mutate(Orders, logis_time = .)

Products <- full_join(Products, Trans, by = "product_category_name" ) %>% .[,-2] %>% .[,c(1,9,2:8)]

Items <- left_join(Items, Orders, by = "order_id")
Items$When <- Items$order_purchase_timestamp %>% format(., "%H")
Items$Weekday <- Items$order_purchase_timestamp %>% weekdays(.)

Items <- full_join(Items, Products, by = "product_id")

UniqueOrders <- Orders[!duplicated(Orders$order_id),] # 附錄要用的

2. 利用產品類別來篩選樣本

- 另建一個新的表格 Items_filtered 來存篩選過後的樣本

- 篩選的條件為「只留下該商品類別總訂單超過 100 筆的樣本」

- 目的是希望在做接下來的類別分析時

- 樣本數量不會小到讓人對於結果存疑


my_filter <- table(Items$product_category_name_english) %>% sort() %>% .[. > 100] %>% names()
Items_filtered <- Items[(Items$product_category_name_english %in% my_filter), ]

- 繪製熱圖

- 以一天內 24 小時的每小時為區間,依每類別訂單數量繪製熱圖

- 用來繪製的數據參照是該小時該類別的訂單數相較於每小時平均是其幾倍

- 並以標準差做熱圖的 row 排序,由上至下為標準差低到高

- 意謂著上方的類別為整天訂單數表現較為穩定者,下方反之


Table_cat_when <- table(Items_filtered$product_category_name_english, Items_filtered$When)
Table_perform <- (Table_cat_when / rowMeans(Table_cat_when))
Table_perform[match((apply(Table_perform, 1, sd) %>% sort %>% names()),rownames(Table_perform)),] %>%
  as.data.frame.matrix %>% 
  d3heatmap(F,F,col=colorRamp(c('seagreen','lightyellow','red')))

3.1. 物流資料探勘及表格整理

- 平均物流天數為 12.48 天


mean(Items$logis_time, na.rm = T)
## Time difference of 12.48485 days

- 因為接下來是要分析、繪製物流相關的資料

- 因此要先將 Customers 以及 Sellers 表都併到 Items 表之中

- 同時新增變數 logistic : 賣家所在州 - 買家所在州


Items <- left_join(Items, Customers, by = "customer_id") %>% left_join(., Sellers, by = "seller_id")

Items$logistic <- paste(Items$seller_state, " - ", Items$customer_state, sep = "")

- 列出熱門路線排行 Top 30


table(Items$logistic) %>% sort(., decreasing = T) %>% head(., 30)
## 
## SP - SP SP - RJ SP - MG SP - RS SP - PR PR - SP MG - SP SP - BA SP - SC 
##   37736   10231    9059    4343    3825    3577    3111    2885    2833 
## MG - MG SP - GO SP - ES SP - DF RJ - SP SC - SP MG - RJ SP - PE SP - CE 
##    1778    1778    1713    1643    1602    1592    1404    1347    1184 
## PR - RJ RJ - RJ PR - MG PR - PR SP - MT SP - PA RS - SP PR - RS RJ - MG 
##    1177    1163     965     851     808     808     792     724     634 
## SP - MA SP - MS SC - RJ 
##     581     580     566

- 先整理物流狀態的表格

- 將物流的州移動路線整理成彙總表


flow_table <- table(Items$seller_state, Items$customer_state)
flow_table <- rbind(flow_table, 0) %>% rbind(., 0) %>% rbind(., 0) %>% rbind(., 0)
rownames(flow_table)[24:27] <- setdiff(colnames(flow_table), rownames(flow_table))

3.2. 各路線物流訂單排行 bar chart

- 以賣家地區為根據計算訂單量,並降冪排列


rowSums(flow_table) %>% sort(., decreasing = F) %>%
  barplot(., xlab = "Amount of Orders", ylab = "States",
          horiz = T, col = "Brown", main = "The distribution of Orders",
          cex.lab = 1, cex.axis = 0.8 )

3.3. 各州物流流向 Chorddiag 環狀圖

- 原先是取出前八名來做進一步的分析 (Chorddiag function 的上限)

- 爾後由於觀察到第七名開始訂單急劇下降,取出前六名來做更進一步的分析


Top8 <- rowSums(flow_table) %>% sort(., decreasing = T) %>% .[1:8] %>% names()
Top6 <- Top8[1:6]

flow_table <- flow_table[rownames(flow_table) %in% Top6, colnames(flow_table) %in% Top6]
flow_table <- as.table(flow_table)
flow_table %>% as.data.frame.matrix() %>% as.matrix() %>% round(3) %>% 
  chorddiag(.)

3.4. 巴西買賣家地區熱圖

- 全部的地區都有繪製,無篩選訂單量

- 且為了讓指數成長的各地區訂單量不要影響到圖型表現而取了 log()

- 縱軸是賣家 / 橫軸是買家


table(Items$seller_state, Items$customer_state) %>% + 1 %>% log() %>%
  as.data.frame.matrix %>% 
  d3heatmap(F,F,col=colorRamp(c('lightyellow','wheat','orangered')),
            labRow = paste("From_", rownames(.), sep =""),
            labCol = paste("To_", colnames(.), sep =""))

3.5. 巴西買賣家地區分佈地圖

- 先自 map_data 中取出世界地圖座標資料,再篩出巴西的資料

- 在 Items 表格上補上地理資訊資料

- Both 買家以及賣家資料

- 利用 zip code 去對應所在座標

- 並將緯度超越國界的去除


Brazil <- map_data("world") %>% filter(region == "Brazil")
Brazil$group <- as.factor(Brazil$group)

colnames(Geolocation)[1] <- "customer_zip_code_prefix"
Geolocation <- Geolocation[!duplicated(Geolocation$customer_zip_code_prefix), ]
Items <- left_join(Items, Geolocation[,1:3], by = "customer_zip_code_prefix")
colnames(Items)[c(40,41)] <- c("cus_lat","cus_lng")

colnames(Geolocation)[1] <- "seller_zip_code_prefix"
Items <- left_join(Items, Geolocation[,1:3], by = "seller_zip_code_prefix")
colnames(Items)[c(42,43)] <- c("sel_lat","sel_lng")

Items_InBrazil <- Items[(Items$cus_lat <= 5.27438888),]
Items_InBrazil <- Items_InBrazil[(Items_InBrazil$sel_lat <= 5.27438888),]

- 繪製賣家分佈地圖


ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="orange", alpha = 0.2) +
  geom_point(data= Items_InBrazil, aes(x = sel_lng, y = sel_lat, color = seller_state), size = 0.2, alpha = 0.3) +
  theme_void()
## Warning: Removed 572 rows containing missing values (geom_point).

- 繪製買家分佈地圖


ggplot() +
  geom_polygon(data = Brazil, aes(x=long, y = lat, group = group), fill="orange", alpha = 0.2) +
  geom_point(data= Items_InBrazil, aes(x = cus_lng, y = cus_lat, color = customer_state), size = 0.2, alpha = 0.3) +
  theme_void()
## Warning: Removed 572 rows containing missing values (geom_point).

3.6. 來自聖保羅州之買、賣家物流統計數據

- 進一步運算 Items 中的變數,生成更多衍生變數

- 以每一個 group 為單位,分組的方式是根據買家及賣家所在地區

- 可視每一個 group 為一種 path

- 為了資料的完整性,有篩選掉出現 NA 值的樣本


Logis_table <- Items %>% filter(., customer_state %in% Top6, seller_state %in% Top6, !is.na(payment_value), !is.na(logis_time)) %>%
  group_by(., customer_state, seller_state) %>% 
  summarise(
  noClients = n(),
  totalRev = sum(payment_value),           # 該 path 訂單總營收
  avgItemsSold = mean(order_item_id),      # 該 path 訂單平均商品訂購量
  avgPrice = totalRev/sum(order_item_id),  # 該 path 訂單平均商品單價
  avgRev = mean(payment_value),            # 該 path 訂單平均營收額
  avgFreight = mean(freight_value),        # 該 path 訂單平均物流費用
  avgLogis = mean(logis_time)              # 該 path 訂單平均物流耗時
  )

- 當買家是來自聖保羅州時


Logis_table[Logis_table$customer_state == "SP",]
## # A tibble: 6 x 9
## # Groups:   customer_state [1]
##   customer_state seller_state noClients totalRev avgItemsSold avgPrice
##   <chr>          <chr>            <int>    <dbl>        <dbl>    <dbl>
## 1 SP             MG                3042  489318.         1.18     136.
## 2 SP             PR                3501  662707.         1.19     159.
## 3 SP             RJ                1554  330106.         1.16     183.
## 4 SP             RS                 779  162306.         1.16     180.
## 5 SP             SC                1571  279828.         1.20     148.
## 6 SP             SP               36934 5218221.         1.22     116.
## # … with 3 more variables: avgRev <dbl>, avgFreight <dbl>,
## #   avgLogis <time>

- 當賣家是來自聖保羅州時


Logis_table[Logis_table$seller_state == "SP",]
## # A tibble: 6 x 9
## # Groups:   customer_state [6]
##   customer_state seller_state noClients totalRev avgItemsSold avgPrice
##   <chr>          <chr>            <int>    <dbl>        <dbl>    <dbl>
## 1 MG             SP                8919 1387532.         1.20     130.
## 2 PR             SP                3766  627963.         1.22     136.
## 3 RJ             SP                9935 1734182.         1.20     146.
## 4 RS             SP                4281  671378.         1.20     130.
## 5 SC             SP                2793  479726.         1.23     139.
## 6 SP             SP               36934 5218221.         1.22     116.
## # … with 3 more variables: avgRev <dbl>, avgFreight <dbl>,
## #   avgLogis <time>

【Part IV】購買分析


0. 將備用資料取出來,初始化 data.frame

Customers <- C2
Items <- I2
Orders <- O2
Products <- PR2
Trans <- T2

colnames(Trans)[1] = "product_category_name"

1. 整理資料

- 轉換商品名稱 (英文)

- 合併資料表


Products=merge(Products, Trans)
rm(Trans)
Products=select(Products,product_id,product_category_name_english)

AllData=merge(Customers,Orders) %>% merge(.,Items) %>% merge(.,Products) 
AllData=AllData[c(1,2,9,17,19)]

- 轉換時間格式,並且新增「星期幾」的欄位


AllData$order_purchase_timestamp = as.POSIXct(AllData$order_purchase_timestamp, format="%Y-%m-%d %H:%M:%S")
AllData=mutate(AllData,weekday=format(AllData$order_purchase_timestamp,'%u'))
AllData$weekday_class <- ifelse(AllData$weekday %in% c("6", "7"), yes = "weekend", no = "weekday")

2. 平日、假日購買金額、訂單量比較

- 先統計平日與假日的訂單筆數

- 繪製週一到週日的個別訂單量


#統計平日與假日的訂單筆數
table(AllData$weekday) %>% sort(., decreasing = T)
## 
##     1     2     3     4     5     7     6 
## 18141 17966 17343 16557 15817 13213 11986
ggplot(AllData,aes(x=weekday)) + 
  geom_bar(fill = "#377EB8")

- 繪製平日與假日的訂單量比較圖


ggplot(AllData,aes(x=weekday_class,fill=weekday)) + 
  geom_bar()

- 繪製週一到週日的個別購買金額圖


ggplot(AllData,aes(x=weekday, y=price)) + 
   geom_bar(stat = "identity" ,fill = "#377EB8")

3. 熱門購買時段分析

- 統計星期一到星期天每個時段的訂單筆數並繪製成熱圖


table(format(AllData$order_purchase_timestamp,"%u"), format(AllData$order_purchase_timestamp,"%H")) %>% 
  as.data.frame.matrix %>% 
  d3heatmap(F,F,col=colorRamp(c('lightyellow','wheat','orangered')))

- 觀察星期一的熱門購買時段


tapply(AllData$weekday=="1", format(AllData$order_purchase_timestamp,"%H"), sum ) %>%sort (.,decreasing = T) %>% head(5)
##   14   16   15   21   11 
## 1268 1248 1244 1219 1209

- 觀察星期二的熱門購買時段


tapply(AllData$weekday=="2", format(AllData$order_purchase_timestamp,"%H"), sum ) %>%sort (.,decreasing = T) %>% head(5)
##   14   16   13   11   15 
## 1282 1247 1204 1186 1172

4. 商品種類購買分析

- 統計每個小時所販售的商品種類數量


Time=format(AllData$order_purchase_timestamp,"%H")
ggplot(AllData,aes(Time ,fill=product_category_name_english)) + 
geom_bar(position = "fill")

- 以直方圖分析星期一 14:00 與 16:00 的熱銷商品


H=format(AllData$order_purchase_timestamp,"%H")
tapply(AllData$weekday=="1" & H=="14" ,AllData$product_category_name_english, sum)%>%sort (.,decreasing = T) %>% head(3)  %>% barplot(main="週一14:00最熱銷的產品",ylim=c(0,160), col="orange", family="STKaiti")

H=format(AllData$order_purchase_timestamp,"%H")
tapply(AllData$weekday=="1" & H=="16" ,AllData$product_category_name_english, sum)%>%sort (.,decreasing = T) %>% head(3)  %>% barplot(main="週一16:00最熱銷的產品",ylim=c(0,160), col="yellow", family="STKaiti")

- 以直方圖分析星期二 14:00 與 16:00 的熱銷商品


H=format(AllData$order_purchase_timestamp,"%H")
tapply(AllData$weekday=="2" & H=="14" ,AllData$product_category_name_english, sum)%>%sort (.,decreasing = T) %>% head(3)  %>% barplot(main="週二14:00最熱銷的產品",ylim=c(0,160), col="orange", family="STKaiti")

H=format(AllData$order_purchase_timestamp,"%H")
tapply(AllData$weekday=="2" & H=="16" ,AllData$product_category_name_english, sum)%>%sort (.,decreasing = T) %>% head(3)  %>% barplot(main="週二16:00最熱銷的產品",ylim=c(0,160), col="yellow", family="STKaiti")


【Part V】下訂時間與品項分析


0. 將備用資料取出來,初始化 data.frame

Customers <- C2
Geolocation <- G2
Items <- I2
Payments <- PA2
Reviews <- R2
Orders <- O2
Products <- PR2
Sellers <- S2
Trans <- T2

1. 資料整理與 data.frame 合併

- 依照 product_category_name 對照到 Products的data frame 中並合併起來

- 依照 product_id 對照 Items 的 data frame 中並合併起來

- 依照 order_id 對照 new_Items 的 data frame 中並合併起來


Products = full_join(Products,Trans, by="product_category_name") %>% .[,-2]
new_Items = full_join(Items,Products, by="product_id") 
new_Orders = full_join(new_Items,Orders, by="order_id") 

- 在 Total 中加入整理好的時間序列並依照小時為格式

- 把變數的名稱改短


Total = mutate(new_Orders,time=format(as.POSIXct(new_Orders$order_purchase_timestamp),"%H")) 
colnames(Total)[colnames(Total)=="product_category_name_english"] <- "categories" 

- 把所有的品項依照前面的名字歸類在一起

- . = 全部字元,* = 0 到無限多個,$ = 字串結尾,\1 = 前面 pattern 第一個()的內容

- 並在完成後把 categories 變為類別變數


Total$categories= gsub("(art).*$", "\\1", Total$categories) %>% 
                  gsub("(books).*$", "\\1", .) %>%
                  gsub("(computers).*$", "\\1", .) %>%
                  gsub("(construction).*$", "\\1", .) %>%
                  gsub("(costruction).*$", "construction", .) %>%
                  gsub("(fashio).*$", "fashion", .) %>%
                  gsub("(food).*$", "\\1", .) %>%
                  gsub("(furniture).*$", "\\1", .) %>%
                  gsub("(home).*$", "\\1", .) %>%
                  gsub("(music).*$", "\\1", .) %>%
                  gsub("(small_appliances).*$", "\\1", .) 

Total$categories = as.factor(Total$categories) 

- 更改時間資料格式為 POSIXct 格式,把時間獨立出來並取其小時


time = as.POSIXct(new_Orders$order_purchase_timestamp, format="%H") 

2. 繪製各時段各種類商品訂單數柱狀圖

- 篩選成交量較多的種類


Total2 =  filter(Total, categories %in% c("bed_bath_table","health_beauty","computers","housewares","furniture",
                                          "sports_leisure","watches_gifts","garden_tools","telephony","toys","auto"
                                          ,"cool_stuff")) 

- 篩選過後的成果繪圖

- time 作為 X 軸,categories 作為分類並把分類作為數字來排序

- 把所有的標題命名

- y 軸使用數量來計算

- 把個別的品項的數字呈現出來位置設定在 0.5

- 把 legend 設定在右邊、字體設定為 8、圖示的大小設定為 0.5 並以線條方式呈現、標題字體設定為 8

- 更改 legend 顏色


ggplot(Total2, aes(x=time ,fill=categories, order = as.numeric(categories))) + 
  labs(title="Count Vs Hours", x="Hours",caption="Source:Olist",fill="Categories") + 
  geom_bar(stat="count") + 
  geom_text(aes(label=..count..),stat="count",position=position_stack(0.5))+ 
  theme(legend.position = "right", 
        legend.text=element_text(size=8), 
        legend.key.size = unit(0.5,"line"), 
        legend.title = element_text(size=8)) +
  guides(fill=guide_legend(ncol=1),colour = guide_colourbar(order = 7))

- 另附上未經篩選過的成果繪圖

- time 作為 X 軸,categories 作為分類並把分類作為數字來排序

- 把所有的標題命名

- y 軸使用數量來計算

- 把個別的品項的數字呈現出來位置設定在 0.5

- 把 legend 設定在右邊、字體設定為 8、圖示的大小設定為 0.5 並以線條方式呈現、標題字體設定為 8

- 更改 legend 顏色


ggplot(Total, aes(x=time ,fill=categories, order = as.numeric(categories))) +
  labs(title="Count Vs Hours", x="Hours",caption="Source:Olist",fill="Categories") +
  geom_bar(stat="count") + 
  theme(legend.position = "right", 
        legend.text=element_text(size=8), 
        legend.key.size = unit(0.5,"line"), 
        legend.title = element_text(size=8)) + 
  guides(fill=guide_legend(ncol=1),colour = guide_colourbar(order = 7)) 


【附錄】下單至購買被確認的等待時間對上訂單金額的關係


- 由於效果不顯著因此沒有放進報告之中

ggplot(UniqueOrders, aes(x = waiting_time, y = Total_payments)) +
  geom_point(alpha = 0.15)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

ggplot(UniqueOrders, aes(x = waiting_time, y = Total_payments)) +
  geom_point(alpha = 0.15) +
  scale_x_continuous(limits = c(0,70)) +
  scale_y_continuous(limits = c(0,7500))

ggplot(UniqueOrders, aes(x = waiting_time, y = Total_payments)) +
  geom_point(alpha = 0.15) +
  scale_x_continuous(limits = c(0,20)) +
  scale_y_continuous(limits = c(0,4000))

ggplot(UniqueOrders, aes(x = waiting_time, y = Total_payments)) +
  geom_point(alpha = 0.05) +
  scale_x_continuous(limits = c(0,20)) +
  scale_y_continuous(limits = c(0,4000))

UniqueOrders$waiting_time_num <- as.numeric(UniqueOrders$waiting_time)

lm1 <- lm(log(Total_payments) ~ waiting_time_num, data = UniqueOrders)
summary(lm1)
## 
## Call:
## lm(formula = log(Total_payments) ~ waiting_time_num, data = UniqueOrders)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4317 -0.5673 -0.0359  0.4821  4.8218 
## 
## Coefficients:
##                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)      4.692448   0.002771 1693.527   <2e-16 ***
## waiting_time_num 0.002767   0.002371    1.167    0.243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8106 on 99278 degrees of freedom
##   (161 observations deleted due to missingness)
## Multiple R-squared:  1.371e-05,  Adjusted R-squared:  3.641e-06 
## F-statistic: 1.361 on 1 and 99278 DF,  p-value: 0.2433
ggplot(UniqueOrders, aes(x = waiting_time_num)) +
  geom_histogram(bins = 1000) +
  scale_x_continuous(limits = c(0,5))

## 篩選

UniqueOrders_Delete_No_Waits <- UniqueOrders[(UniqueOrders$waiting_time_num >= 1),]

ggplot(UniqueOrders_Delete_No_Waits, aes(x = waiting_time, y = Total_payments)) +
  geom_point(alpha = 0.05) +
  scale_x_continuous(limits = c(0,70)) +
  scale_y_continuous(limits = c(0,7500))

lm2 <- lm(log(Total_payments) ~ waiting_time_num, data = UniqueOrders_Delete_No_Waits)
summary(lm2)
## 
## Call:
## lm(formula = log(Total_payments) ~ waiting_time_num, data = UniqueOrders_Delete_No_Waits)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3371 -0.6093 -0.0566  0.4914  4.8181 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       4.738860   0.009260  511.74  < 2e-16 ***
## waiting_time_num -0.011484   0.003397   -3.38 0.000726 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8656 on 17418 degrees of freedom
##   (160 observations deleted due to missingness)
## Multiple R-squared:  0.0006555,  Adjusted R-squared:  0.0005982 
## F-statistic: 11.43 on 1 and 17418 DF,  p-value: 0.0007259